import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk

from tqdm import trange
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer


import warnings

# Notebook-wide setup: silence all warnings, fetch the 'omw-1.4' NLTK
# resource quietly, and configure plot styling.
warnings.filterwarnings('ignore')
nltk.download('omw-1.4', quiet=True)

# Seaborn theme first, then the matplotlib defaults (figure size, font size).
sns.set_style('darkgrid')
plt.rcParams.update({
    'figure.figsize': (17, 7),
    'font.size': 18,
})


# Load the review dataset from the local CSV file and preview the first ten rows.
data = pd.read_csv('data.csv')
data.head(10)


# Number of reviews per value of the 'Rating' column.
data['Rating'].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Rating, dtype: int64


def ratings(rating):
    """Map a numeric rating to a sentiment label.

    Ratings in (3, 5] (i.e. 4 and 5) become "Positive"; ratings in (0, 3]
    (i.e. 1, 2 and 3) become "Negative". Any other value yields ``None``.
    """
    if 3 < rating <= 5:
        return "Positive"
    if 0 < rating <= 3:
        return "Negative"
    return None


# Replace the numeric rating with its sentiment label.
data['Rating'] = data['Rating'].apply(ratings)

# Take the slice labels from the same value_counts() result that supplies the
# slice sizes. value_counts() is ordered by frequency while unique() is
# ordered by first appearance, so mixing them can attach a label to the
# wrong slice.
rating_counts = data['Rating'].value_counts()
plt.pie(rating_counts, labels=rating_counts.index.tolist(), autopct='%1.1f%%')
plt.show()


# Character length of the first review, as a quick sanity check.
first_review = data['Review'][0]
lenght = len(first_review)
print(f'Length of a sample review: {lenght}')

Length of a sample review: 593


# Store the character length of every review in a new 'Length' column,
# then preview the first ten rows.
data['Length'] = data['Review'].str.len()
data.head(10)


# Whitespace-separated tokens of the first review, as a quick sanity check.
# Deliberately NOT named `word_count`: a function of that name is defined
# later in this file and is passed to `apply`, so binding a list to the same
# name would break that call if the cells were re-run out of order.
sample_words = data['Review'][0].split()
print(f'Word count in a sample review: {len(sample_words)}')

Word count in a sample review: 87


def word_count(review):
    """Return the number of whitespace-separated tokens in ``review``."""
    return len(review.split())


# Store the number of whitespace-separated tokens of every review in a new
# 'Word_count' column, then preview the first ten rows.
data['Word_count'] = data['Review'].apply(word_count)
data.head(10)


# Average token length (in characters) per review; tokens are split on
# whitespace.
data['mean_word_length'] = data['Review'].apply(
    lambda text: np.mean(list(map(len, text.split())))
)
data.head(10)


# Fetch the 'punkt' resource used by the sentence tokenizer calls that follow.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

True


# Mean sentence length (in characters) of the first review, where sentences
# come from NLTK's sent_tokenize.
np.mean([len(sent) for sent in tokenize.sent_tokenize(data['Review'][0])])

591.0


# Average sentence length (in characters) per review, using NLTK's sentence
# tokenizer to split each review.
data['mean_sent_length'] = data['Review'].apply(
    lambda text: np.mean([len(sentence) for sentence in tokenize.sent_tokenize(text)])
)
data.head(10)


def visualize(col):
    """Plot the distribution of one feature of ``data``, split by rating.

    Draws one figure with two side-by-side panels for ``data[col]``:
    a box plot per ``Rating`` value on the left and a kernel density
    estimate per ``Rating`` value on the right. Reads the module-level
    ``data`` frame and shows the figure; returns nothing.

    Args:
        col: Name of the column of ``data`` to plot.
    """
    print()  # blank line separating the output of consecutive figures

    _, (box_ax, kde_ax) = plt.subplots(1, 2)

    # Rating goes on the x axis rather than in `hue` alone: some seaborn
    # versions refuse `hue` on a categorical plot unless both x and y are set.
    sns.boxplot(data=data, x='Rating', y=col, ax=box_ax)
    box_ax.set_ylabel(col, labelpad=12.5)

    # kdeplot is the axes-level KDE function, so it draws into the second
    # panel. The figure-level displot opens its own figure and would leave
    # this panel empty. Seaborn builds the legend from `hue`, which keeps
    # every label attached to the right curve.
    sns.kdeplot(data=data, x=col, hue='Rating', ax=kde_ax)
    kde_ax.set_xlabel('')
    kde_ax.set_ylabel('')

    plt.show()


# Every column from the third one onward is an engineered feature; plot each.
features = list(data.columns[2:])
for column_name in features:
    visualize(column_name)


# Work on a copy of the data without the engineered feature columns.
df = data.drop(columns=features)
df.head()


# Column names, dtypes and non-null counts of the reduced frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  object
dtypes: object(2)
memory usage: 320.3+ KB


_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    # Loaded lazily on first use rather than at definition time, because the
    # stopwords corpus is only downloaded after this definition in the file.
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean(review):
    """Normalize a review: lowercase, strip punctuation, drop stop words.

    Steps, in order:
      1. Lowercase the text.
      2. Remove every character that is not a letter, digit, space or hyphen.
      3. Split on whitespace and drop tokens found in NLTK's English
         stop-word list.

    Args:
        review: Raw review text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Fetch the stop words once per call as a set. The previous code re-read
    # the word list and scanned it linearly for every single token.
    stop_words = _english_stopwords()

    review = review.lower()
    review = re.sub(r'[^a-z A-Z 0-9-]+', '', review)
    return " ".join(word for word in review.split() if word not in stop_words)


# Fetch the NLTK stop-word corpus that clean() reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

True


# Replace every review with its cleaned form (see clean()), then preview.
df['Review'] = df['Review'].apply(clean)
df.head(10)


# Inspect the first cleaned review in full.
df['Review'][0]

'nice hotel expensive parking got good deal stay hotel anniversary arrived late evening took advice previous reviews valet parking check quick easy little disappointed non-existent view room room clean nice size bed comfortable woke stiff neck high pillows soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway maybe noisy neighbors aveda bath products nice goldfish stay nice touch taken advantage staying longer location great walking distance shopping overall nice experience pay 40 parking night'


def corpus(text):
    """Split ``text`` on whitespace and return the resulting list of tokens."""
    return text.split()


# Tokenize every cleaned review into a list of words, stored in a new
# 'Review_lists' column, then preview the first ten rows.
df['Review_lists'] = df['Review'].apply(corpus)
df.head(10)


# Flatten the per-review token lists into one long list of tokens, showing a
# progress bar while iterating.
# NOTE: this rebinds the name `corpus`, which until here referred to the
# tokenizing function of the same name.
corpus = []
progress = trange(df.shape[0], ncols=150, nrows=10, colour='green', smoothing=0.8)
for row in progress:
    corpus.extend(df['Review_lists'][row])
len(corpus)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 20491/20491 [00:00<00:00, 74595.55it/s]

2060911


# The ten most frequent tokens across all reviews, as (token, count) pairs.
mostCommon = Counter(corpus).most_common(10)
mostCommon

[('hotel', 48844),
 ('room', 34279),
 ('great', 21068),
 ('nt', 18989),
 ('good', 16953),
 ('staff', 16193),
 ('stay', 15142),
 ('nice', 12393),
 ('rooms', 12005),
 ('location', 11009)]


# Split the (token, count) pairs into two parallel lists for plotting.
words = [term for term, _ in mostCommon]
freq = [occurrences for _, occurrences in mostCommon]


# Horizontal bar chart: one bar per token, bar length = token frequency.
sns.barplot(x=freq, y=words)
plt.title('Top 10 Most Frequently Occuring Words')
plt.show()


# Count every bigram (pair of consecutive tokens) in each cleaned review.
cv = CountVectorizer(ngram_range=(2,2))
bigrams = cv.fit_transform(df['Review'])


# Total occurrences of each bigram over all reviews. The column sums are
# taken directly on the sparse matrix: converting it to a dense array first
# would allocate documents x vocabulary cells just to add them up.
count_values = np.asarray(bigrams.sum(axis=0)).ravel()

# (frequency, ngram) rows sorted from most to least frequent;
# cv.vocabulary_ maps each ngram to its column index in `bigrams`.
ngram_freq = pd.DataFrame(
    sorted(((count_values[i], k) for k, i in cv.vocabulary_.items()), reverse=True),
    columns=["frequency", "ngram"],
)


# Bar chart of the ten most frequent bigrams (the frame is already sorted by
# descending frequency).
top_bigrams = ngram_freq.head(10)
sns.barplot(x=top_bigrams['frequency'], y=top_bigrams['ngram'])
plt.title('Top 10 Most Frequently Occuring Bigrams')
plt.show()


# Count every trigram (three consecutive tokens) in each cleaned review.
cv1 = CountVectorizer(ngram_range=(3,3))
trigrams = cv1.fit_transform(df['Review'])

# Total occurrences of each trigram over all reviews. The column sums are
# taken directly on the sparse matrix: converting it to a dense array first
# would allocate documents x vocabulary cells just to add them up.
count_values = np.asarray(trigrams.sum(axis=0)).ravel()

# (frequency, ngram) rows sorted from most to least frequent;
# cv1.vocabulary_ maps each ngram to its column index in `trigrams`.
ngram_freq = pd.DataFrame(
    sorted(((count_values[i], k) for k, i in cv1.vocabulary_.items()), reverse=True),
    columns=["frequency", "ngram"],
)


# Bar chart of the ten most frequent trigrams (the frame is already sorted by
# descending frequency).
top_trigrams = ngram_freq.head(10)
sns.barplot(x=top_trigrams['frequency'], y=top_trigrams['ngram'])
plt.title('Top 10 Most Frequently Occuring Trigrams')
plt.show()

	Review	Rating
0	nice hotel expensive parking got good deal sta...	4
1	ok nothing special charge diamond member hilto...	2
2	nice rooms not 4* experience hotel monaco seat...	3
3	unique, great stay, wonderful time hotel monac...	5
4	great stay great stay, went seahawk game aweso...	5
5	love monaco staff husband stayed hotel crazy w...	5
6	cozy stay rainy city, husband spent 7 nights m...	5
7	excellent staff, housekeeping quality hotel ch...	4
8	hotel stayed hotel monaco cruise, rooms genero...	5
9	excellent stayed hotel monaco past w/e delight...	5

	Review	Rating	Length
0	nice hotel expensive parking got good deal sta...	Positive	593
1	ok nothing special charge diamond member hilto...	Negative	1689
2	nice rooms not 4* experience hotel monaco seat...	Negative	1427
3	unique, great stay, wonderful time hotel monac...	Positive	600
4	great stay great stay, went seahawk game aweso...	Positive	1281
5	love monaco staff husband stayed hotel crazy w...	Positive	1002
6	cozy stay rainy city, husband spent 7 nights m...	Positive	748
7	excellent staff, housekeeping quality hotel ch...	Positive	597
8	hotel stayed hotel monaco cruise, rooms genero...	Positive	419
9	excellent stayed hotel monaco past w/e delight...	Positive	271

	Review	Rating	Length	Word_count
0	nice hotel expensive parking got good deal sta...	Positive	593	87
1	ok nothing special charge diamond member hilto...	Negative	1689	250
2	nice rooms not 4* experience hotel monaco seat...	Negative	1427	217
3	unique, great stay, wonderful time hotel monac...	Positive	600	89
4	great stay great stay, went seahawk game aweso...	Positive	1281	191
5	love monaco staff husband stayed hotel crazy w...	Positive	1002	134
6	cozy stay rainy city, husband spent 7 nights m...	Positive	748	101
7	excellent staff, housekeeping quality hotel ch...	Positive	597	85
8	hotel stayed hotel monaco cruise, rooms genero...	Positive	419	59
9	excellent stayed hotel monaco past w/e delight...	Positive	271	35

	Review	Rating	Length	Word_count	mean_word_length
0	nice hotel expensive parking got good deal sta...	Positive	593	87	5.804598
1	ok nothing special charge diamond member hilto...	Negative	1689	250	5.752000
2	nice rooms not 4* experience hotel monaco seat...	Negative	1427	217	5.571429
3	unique, great stay, wonderful time hotel monac...	Positive	600	89	5.730337
4	great stay great stay, went seahawk game aweso...	Positive	1281	191	5.701571
5	love monaco staff husband stayed hotel crazy w...	Positive	1002	134	6.470149
6	cozy stay rainy city, husband spent 7 nights m...	Positive	748	101	6.396040
7	excellent staff, housekeeping quality hotel ch...	Positive	597	85	6.011765
8	hotel stayed hotel monaco cruise, rooms genero...	Positive	419	59	6.084746
9	excellent stayed hotel monaco past w/e delight...	Positive	271	35	6.714286

	Review	Rating	Length	Word_count	mean_word_length	mean_sent_length
0	nice hotel expensive parking got good deal sta...	Positive	593	87	5.804598	591.0
1	ok nothing special charge diamond member hilto...	Negative	1689	250	5.752000	1687.0
2	nice rooms not 4* experience hotel monaco seat...	Negative	1427	217	5.571429	712.0
3	unique, great stay, wonderful time hotel monac...	Positive	600	89	5.730337	598.0
4	great stay great stay, went seahawk game aweso...	Positive	1281	191	5.701571	1279.0
5	love monaco staff husband stayed hotel crazy w...	Positive	1002	134	6.470149	1000.0
6	cozy stay rainy city, husband spent 7 nights m...	Positive	748	101	6.396040	746.0
7	excellent staff, housekeeping quality hotel ch...	Positive	597	85	6.011765	595.0
8	hotel stayed hotel monaco cruise, rooms genero...	Positive	419	59	6.084746	417.0
9	excellent stayed hotel monaco past w/e delight...	Positive	271	35	6.714286	269.0

EDA for Natural Language Processing¶

Setup¶

Loading the Data¶

Exploratory Data Analysis¶

Counts and Length:¶

Word Count: Number of words in a review¶

Mean word length: Average length of words¶

Mean sentence length: Average length of the sentences in the review¶

Term Frequency Analysis¶

Most Frequently Occurring N-grams¶

	Review	Rating	Review_lists
0	nice hotel expensive parking got good deal sta...	Positive	[nice, hotel, expensive, parking, got, good, d...
1	ok nothing special charge diamond member hilto...	Negative	[ok, nothing, special, charge, diamond, member...
2	nice rooms 4 experience hotel monaco seattle g...	Negative	[nice, rooms, 4, experience, hotel, monaco, se...
3	unique great stay wonderful time hotel monaco ...	Positive	[unique, great, stay, wonderful, time, hotel, ...
4	great stay great stay went seahawk game awesom...	Positive	[great, stay, great, stay, went, seahawk, game...
5	love monaco staff husband stayed hotel crazy w...	Positive	[love, monaco, staff, husband, stayed, hotel, ...
6	cozy stay rainy city husband spent 7 nights mo...	Positive	[cozy, stay, rainy, city, husband, spent, 7, n...
7	excellent staff housekeeping quality hotel cho...	Positive	[excellent, staff, housekeeping, quality, hote...
8	hotel stayed hotel monaco cruise rooms generou...	Positive	[hotel, stayed, hotel, monaco, cruise, rooms, ...
9	excellent stayed hotel monaco past delight rec...	Positive	[excellent, stayed, hotel, monaco, past, delig...